Awesome things:
# Load packages ----
library(tidyverse)
library(janitor)
library(plotly)       # less popular since the rise of shiny, but still useful!
library(factoextra)   # used before with biplots for PCA
library(RColorBrewer)

# Cluster analyses:
library(NbClust)
library(cluster)
library(dendextend)
library(ggdendro)

# Text analyses:
library(pdftools)
library(tidytext)
library(wordcloud)
A. Check it out
# Standardize the column headings with janitor::clean_names()
iris_nice <- clean_names(iris)

# Quick look: petal dimensions, colored by species
ggplot(iris_nice, aes(x = petal_length, y = petal_width, color = species)) +
  geom_point()
# It may seem obvious here, but let's ask R: "How many clusters do YOU think
# there should be?" -- via NbClust::NbClust.
# [1:4] selects the four measurement columns; bound the search so that
# between 2 and 10 clusters are considered.
number_est <- NbClust(data = iris_nice[1:4],
                      min.nc = 2,
                      max.nc = 10,
                      method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Conclusion: "the best number of clusters is 2"
# But we'll stick with 3 clusters because of what we know about the data
B. Perform k-means and explore the outcome
# Run k-means on the four measurement columns with k = 3 clusters.
# kmeans() picks random starting centers, so set a seed first to make
# the cluster assignment (and its numbering) reproducible.
set.seed(10)
iris_km <- kmeans(iris_nice[1:4], centers = 3)

# How many observations ended up in each cluster?
iris_km$size
## [1] 62 38 50
# For each cluster: where is its multivariate center in the 4-D
# measurement space? (one row per cluster, one column per variable)
iris_km$centers
## sepal_length sepal_width petal_length petal_width
## 1 5.901613 2.748387 4.393548 1.433871
## 2 6.850000 3.073684 5.742105 2.071053
## 3 5.006000 3.428000 1.462000 0.246000
# Which cluster has each observation been assigned to?
# (an integer vector, one entry per row of the data)
iris_km$cluster
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
# Bind the cluster assignments (as a factor, for plotting) onto the
# original data so each observation carries its cluster label.
iris_cl <- data.frame(iris_nice, cluster_no = factor(iris_km$cluster))
# NOTE: View() opens an interactive data viewer -- it only works in an
# interactive session (e.g. RStudio), not when knitting a document.
View(iris_cl)
# Visualize the clusters in two of the four dimensions:
ggplot(iris_cl, aes(x = sepal_length, y = sepal_width, color = cluster_no)) +
  geom_point()

# To hint at the remaining dimensions, map species to point shape as well:
ggplot(iris_cl, aes(x = petal_length, y = petal_width,
                    color = cluster_no, shape = species)) +
  geom_point() +
  scale_color_brewer(palette = "Dark2") +
  theme_light()
# For a genuine 3-D representation, use plotly (note its different syntax):
# plotly strengths: you can build interactive widgets, and the reactive
# output survives in the knitted HTML file!
# Use the data= argument with ~formula references consistently (the
# original mixed bare vectors with one ~ formula), and state
# mode = "markers" explicitly to avoid the "No scatter3d mode specifed"
# warning plotly would otherwise emit.
plot_ly(data = iris_cl,
        x = ~petal_length,
        y = ~petal_width,
        z = ~sepal_width,
        type = "scatter3d",
        mode = "markers",
        color = ~cluster_no,
        symbol = ~species,
        marker = list(size = 3),
        colors = "Set1")
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
Data: >150 countries.
Note: some of these variables span very different orders of magnitude, so it is useful to standardize them with the scale() function before clustering.
A. Wrangle
# Remember: inside an R Project, file paths are relative to the project root
wb_env <- readr::read_csv("wb_env.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## region = col_character(),
## electricity = col_double(),
## agland = col_double(),
## co2 = col_double(),
## methane = col_double(),
## ghg = col_double()
## )
# Keep only the 20 highest-GHG-emitting countries.
# desc(ghg) sorts by the ghg column in DESCENDING order (the original
# "-ghg" shorthand does the same thing, but desc() is clearer).
wb_ghg_20 <- wb_env %>%
  arrange(desc(ghg)) %>%
  head(20)

# Standardize the numeric variables (columns 3-7) so they are comparable,
# then coerce back to a data frame -- scale() returns a matrix, not a
# data frame.
wb_scaled <- as.data.frame(scale(wb_ghg_20[3:7]))

# Use country names as row names so they label the dendrogram leaves
rownames(wb_scaled) <- wb_ghg_20$name
B. Calculate Euclidean distances and perform both agglomerative and divisive clustering. The results are dendrograms.
# Compute pairwise dissimilarities (Euclidean distances) between the
# scaled observations, using stats::dist
diss <- stats::dist(wb_scaled, method = "euclidean")

# Hierarchical AGGLOMERATIVE clustering with complete linkage.
# Feed hclust() the dissimilarity object (diss); "complete" is already
# the default method, but state it explicitly for clarity.
hc_complete <- stats::hclust(diss, method = "complete")

# Base-graphics dendrogram (hang = -1 aligns all leaf labels at the bottom)
plot(hc_complete, cex = 0.6, hang = -1)

# Now cluster DIVISIVELY (cluster::diana) and note how the output differs
hc_div <- diana(diss)
plot(hc_div)
C. Compare these results, since they differ slightly.
# Convert both hclust results to class "dendrogram" for dendextend
dend1 <- as.dendrogram(hc_complete)
dend2 <- as.dendrogram(hc_div)

# Combine them into a dendlist so they can be compared as a pair
dend_list <- dendlist(dend1, dend2)

# A tanglegram draws the two dendrograms face to face and connects
# matching leaves; if the methods had produced identical trees, we would
# see straight lines across. Pass the combined dendlist (the original
# built dend_list but never used it).
tanglegram(dend_list)
# Convert to class "dendro" so its segments/labels can be used in ggplot
data1 <- dendro_data(hc_complete)

# Quick dendrogram via ggdendro::ggdendrogram(), rotated so country
# names read horizontally
ggdendrogram(hc_complete, rotate = TRUE) +
  theme_minimal() +
  labs(x = "Country")
# Want to build the dendrogram directly in ggplot? Here:
# Leaf segments are the vertical segments (x == xend) ending at a
# whole-number x position (x %% 1 == 0); bind_cols() pairs them with the
# labels BY ROW ORDER, which assumes both are sorted identically --
# NOTE(review): confirm this alignment holds for other trees.
label_data <- bind_cols(filter(segment(data1), x == xend & x %% 1 == 0),
                        label(data1))

ggplot() +
  geom_segment(data = segment(data1),
               aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_text(data = label_data,
            aes(x = xend, y = yend, label = label, hjust = 0),
            size = 2) +
  coord_flip() +
  scale_y_reverse(expand = c(0.2, 0)) +
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        # legend.position accepts lowercase values; "None" is invalid
        legend.position = "none")
Packages used in this section: pdftools, stringr, tidytext
A. Extract and analyze information from a pdf, using Greta Thunberg’s speech from COP24
# Note the new syntax for specifying a file path: file.path()
greta <- file.path("greta_thunberg.pdf")

# pdftools::pdf_text() reads the PDF in as one long string per page
thunberg_text <- pdf_text(greta)

# Put the text into a data frame; each page arrives in a single cell,
# so split on the newline character ("\n") and unnest so each line of
# the PDF becomes its own row. 'text_full' is the resulting column.
thunberg_df <- data.frame(text = thunberg_text) %>%
  mutate(text_full = str_split(text, '\\n')) %>%
  unnest(text_full)
# Keep only the speech body (rows 4-18), dropping the PDF header lines
speech_text <- thunberg_df %>%
  select(text_full) %>%
  slice(4:18)

# Tokenize: one word per row
sep_words <- speech_text %>%
  unnest_tokens(word, text_full)

# Count how often each word appears, most frequent first
word_count <- sep_words %>%
  count(word, sort = TRUE)
# ...but common filler terms (pronouns etc.) that we don't want to
# analyze are still here!
# R has built-in lexicons to sort these out -- see the stop_words
# documentation in tidytext.
# Remove the stop words; making the join key explicit (by = "word")
# documents the match column and silences the "Joining, by = ..." message.
words_stop <- sep_words %>%
  anti_join(stop_words, by = "word")

# Recount on the filtered words, arranged most frequent first
word_count <- words_stop %>%
  count(word, sort = TRUE)

word_count
## # A tibble: 65 x 2
## word n
## <chr> <int>
## 1 children 3
## 2 people 3
## 3 speak 3
## 4 care 2
## 5 climate 2
## 6 justice 2
## 7 matter 2
## 8 sacrificed 2
## 9 sweden 2
## 10 15 1
## # ... with 55 more rows
B. More stuff. Example sentiment values from lexicon:
# Peek at strongly positive words (AFINN scores of 4 or 5).
# NOTE(review): newer tidytext releases rename this column 'score' ->
# 'value' -- confirm against the installed version.
pos_words <- get_sentiments("afinn") %>%
  filter(score %in% c(4, 5)) %>%
  head(20)

pos_words
## # A tibble: 20 x 2
## word score
## <chr> <int>
## 1 amazing 4
## 2 awesome 4
## 3 breathtaking 5
## 4 brilliant 4
## 5 ecstatic 4
## 6 euphoric 4
## 7 exuberant 4
## 8 fabulous 4
## 9 fantastic 4
## 10 fun 4
## 11 funnier 4
## 12 funny 4
## 13 godsend 4
## 14 heavenly 4
## 15 hurrah 5
## 16 lifesaver 4
## 17 lmao 4
## 18 lmfao 4
## 19 masterpiece 4
## 20 masterpieces 4
# Peek at roughly neutral words (AFINN scores from -1 through 1)
neutral_words <- get_sentiments("afinn") %>%
  filter(score >= -1, score <= 1) %>%
  head(20)

neutral_words
## # A tibble: 20 x 2
## word score
## <chr> <int>
## 1 aboard 1
## 2 absentee -1
## 3 absentees -1
## 4 absorbed 1
## 5 accept 1
## 6 accepted 1
## 7 accepting 1
## 8 accepts 1
## 9 achievable 1
## 10 active 1
## 11 adequate 1
## 12 admit -1
## 13 admits -1
## 14 admitted -1
## 15 adopt 1
## 16 adopts 1
## 17 advanced 1
## 18 affected -1
## 19 afflicted -1
## 20 affronted -1
C. More stuff. Bind some lexicon information to our actual speech words (non stop words)
An aside: when in doubt, use a full join — it keeps everything!
# Attach AFINN sentiment scores to the speech words; an inner join keeps
# only words that appear in the lexicon. Making by = "word" explicit
# documents the key and silences the "Joining, by = ..." message.
sent_afinn <- words_stop %>%
  inner_join(get_sentiments("afinn"), by = "word")

sent_afinn
## word score
## 1 justice 2
## 2 matter 1
## 3 matter 1
## 4 uncomfortable -2
## 5 growth 2
## 6 scared -2
## 7 bad -3
## 8 mess -2
## 9 emergency -2
## 10 mature 2
## 11 burden -2
## 12 leave -1
## 13 care 2
## 14 popular 3
## 15 care 2
## 16 justice 2
## 17 opportunity 2
## 18 rich 2
## 19 pay -1
## 20 celebrate 3
#removed "greta" because no matches, via inner join, with the lexicon
sent_nrc <- words_stop %>%
inner_join(get_sentiments("nrc"))
## Joining, by = "word"
sent_nrc
## word sentiment
## 1 justice positive
## 2 justice trust
## 3 school trust
## 4 uncomfortable negative
## 5 green joy
## 6 green positive
## 7 green trust
## 8 growth positive
## 9 unpopular disgust
## 10 unpopular negative
## 11 unpopular sadness
## 12 talk positive
## 13 forward positive
## 14 bad anger
## 15 bad disgust
## 16 bad fear
## 17 bad negative
## 18 bad sadness
## 19 mess disgust
## 20 mess negative
## 21 pull positive
## 22 emergency fear
## 23 emergency negative
## 24 emergency sadness
## 25 emergency surprise
## 26 leave negative
## 27 leave sadness
## 28 leave surprise
## 29 justice positive
## 30 justice trust
## 31 civilization positive
## 32 civilization trust
## 33 opportunity anticipation
## 34 opportunity positive
## 35 continue anticipation
## 36 continue positive
## 37 continue trust
## 38 money anger
## 39 money anticipation
## 40 money joy
## 41 money positive
## 42 money surprise
## 43 money trust
## 44 luxury joy
## 45 luxury positive
## 46 pay anticipation
## 47 pay joy
## 48 pay positive
## 49 pay trust
## 50 birthday anticipation
## 51 birthday joy
## 52 birthday positive
## 53 birthday surprise
# Tally how many speech words fall into each NRC sentiment bin
# (count() is shorthand for group_by() + tally() + ungroup())
nrc_count <- sent_nrc %>%
  count(sentiment)

nrc_count
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 anger 2
## 2 anticipation 5
## 3 disgust 3
## 4 fear 2
## 5 joy 5
## 6 negative 6
## 7 positive 14
## 8 sadness 4
## 9 surprise 4
## 10 trust 8
D. What can we do with these results?
See key for more analyses. But here, let’s build a Word Cloud!
# Build a word cloud from the stop-word-filtered counts; scale= sets the
# largest/smallest text sizes, and colors come from the Dark2 palette.
wordcloud(words = word_count$word,
          freq = word_count$n,
          min.freq = 1,
          max.words = 65,
          scale = c(2, 0.1),
          colors = brewer.pal(3, "Dark2"))